{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 高校动态"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from requests_html import HTMLSession\n",
    "import requests_html\n",
    "import pandas as pd\n",
    "import urllib.parse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "200"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "session = HTMLSession()\n",
    "r = session.get(\"https://www.nfu.edu.cn/gjdt/index.htm\")\n",
    "r.status_code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 存\n",
    "with open (\"html_out/_nfu_gjdt.html\", encoding = \"utf8\", mode = \"w\") as fp:\n",
    "    fp.write(r.html.html)\n",
    "# 读\n",
    "with open (\"html_out/_nfu_gjdt.html\", encoding = \"utf8\", mode = \"r\") as fp:\n",
    "    html_load = fp.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Element html at 0x25f4a559450>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "parsed = requests_html.soup_parse(html_load)\n",
    "parsed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ParseResult(scheme='https', netloc='www.nfu.edu.cn', path='/gjdt/index.htm', params='', query='', fragment='')"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "base_url = r.url\n",
    "nfu_urlparse = urllib.parse.urlparse(base_url)\n",
    "nfu_urlparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/gjdt/309be8b078444044b51624f0e186729e.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/159b20971f8b4051ba7cbbc80e65b871.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/27ba495edc1b49f88bcebb750c5dcc33.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/20dc120c250642cca5815c93591bb5cb.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/b43531427fb44695bbb0e16280988965.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/1509f4f3bc2f4babbe57c7ec29854807.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/a4b2fb3dacae4564976e9a951bcddcff.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/0e7d664c116f4a0ab9e6d165977f9def.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/71e152ddce12414388346126ba1a1b6b.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/0d7bfc95f70841a6b5c2ad85f74c3510.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/750c396e278446e687a42965b1c9a385.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/cd3ecf8986ad40e991e37f635f1ab8b4.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/3a7835bded2441aeb67bbc6f2a61471c.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/4d0eb3a8b8ee47f6b9bf0e9fda26fa4a.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/355e3d6207974a3ea62ef78d2ecc2f23.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/9b6486d83f454a2ca7a0267169ca534d.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/6bb7172f46b3458b8022b1132593b96b.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/a8f6f2c6a2c644d2998d7836f8151f58.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/f978598b61024bb2982f5fbb32f81b9d.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/3d9e9dfac15945938692c88167347ee7.htm']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 重组链接\n",
    "list_URL  = [urllib.parse.urlunparse\\\n",
    "([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "for detail_url in parsed.xpath('//div[@class=\"news_title\"]/a/@href')]\n",
    "list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>标题</th>\n",
       "      <th>链接</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>教育部党组《求是》撰文：精心谋划 切实抓好教育系统党史学习教育</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/309be8b078444044b5...</td>\n",
       "      <td>2021-04-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>教育部长陈宝生：把巩固拓展作为开局之年工作主题，做到6个到位</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/159b20971f8b4051ba...</td>\n",
       "      <td>2021-03-20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>如何建设高质量教育体系？“十四五”规划和2035年远景目标纲要明确了</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/27ba495edc1b49f88b...</td>\n",
       "      <td>2021-03-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>教育部长陈宝生《旗帜》撰文：建设高质量教育体系，加快建成教育强国</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/20dc120c250642cca5...</td>\n",
       "      <td>2021-01-05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>重磅！《推进粤港澳大湾区高等教育合作发展规划》正式印发</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/b43531427fb44695bb...</td>\n",
       "      <td>2020-12-22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>教育部长陈宝生：大力提升青少年宪法法治教育质量</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/1509f4f3bc2f4babbe...</td>\n",
       "      <td>2020-12-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>教育系统如何学习贯彻五中全会精神？教育部最新通知来了</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/a4b2fb3dacae456497...</td>\n",
       "      <td>2020-11-20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>教育部长陈宝生撰文：建设高质量教育体系  五中全会深解读</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/0e7d664c116f4a0ab9...</td>\n",
       "      <td>2020-11-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>教育部长陈宝生：教育系统要找准学习领会切入点与维度，做到6个“理解”|五中全会大学习</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/71e152ddce12414388...</td>\n",
       "      <td>2020-11-06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>教育战线如何贯彻落实五中全会精神？教育部最新部署来了</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/0d7bfc95f70841a6b5...</td>\n",
       "      <td>2020-11-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>如何深化新时代教育评价改革？教育部11问答详解（附一图看懂）</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/750c396e278446e687...</td>\n",
       "      <td>2020-10-22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>中共中央 国务院印发《深化新时代教育评价改革总体方案》</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/cd3ecf8986ad40e991...</td>\n",
       "      <td>2020-10-22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>重磅！中办、国办印发《关于全面加强和改进新时代学校体育工作的意见》和《关于全面加强和改进新时...</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/3a7835bded2441aeb6...</td>\n",
       "      <td>2020-10-21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>钟秉林、张志勇、沈炜解读：教育评价“指挥棒”全面转向！</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/4d0eb3a8b8ee47f6b9...</td>\n",
       "      <td>2020-10-20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>教育部党组：开启全面建设高素质专业化创新型教师队伍新征程</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/355e3d6207974a3ea6...</td>\n",
       "      <td>2020-10-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>教育部长陈宝生《求是》撰文：新时代建设教育强国的根本指针</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/9b6486d83f454a2ca7...</td>\n",
       "      <td>2020-09-22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>如何学习贯彻习近平总书记教师节重要寄语？教育部最新通知来了</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/6bb7172f46b3458b80...</td>\n",
       "      <td>2020-09-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>习近平主持召开科学家座谈会强调不断向科学技术广度和深度进军</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/a8f6f2c6a2c644d299...</td>\n",
       "      <td>2020-09-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>《求是》杂志发表习近平总书记重要文章</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/f978598b61024bb298...</td>\n",
       "      <td>2020-08-31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>来听！教育部党组成员这样讲专题党课</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/3d9e9dfac159459386...</td>\n",
       "      <td>2020-08-28</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   标题  \\\n",
       "0                     教育部党组《求是》撰文：精心谋划 切实抓好教育系统党史学习教育   \n",
       "1                      教育部长陈宝生：把巩固拓展作为开局之年工作主题，做到6个到位   \n",
       "2                  如何建设高质量教育体系？“十四五”规划和2035年远景目标纲要明确了   \n",
       "3                    教育部长陈宝生《旗帜》撰文：建设高质量教育体系，加快建成教育强国   \n",
       "4                         重磅！《推进粤港澳大湾区高等教育合作发展规划》正式印发   \n",
       "5                             教育部长陈宝生：大力提升青少年宪法法治教育质量   \n",
       "6                          教育系统如何学习贯彻五中全会精神？教育部最新通知来了   \n",
       "7                        教育部长陈宝生撰文：建设高质量教育体系  五中全会深解读   \n",
       "8          教育部长陈宝生：教育系统要找准学习领会切入点与维度，做到6个“理解”|五中全会大学习   \n",
       "9                          教育战线如何贯彻落实五中全会精神？教育部最新部署来了   \n",
       "10                     如何深化新时代教育评价改革？教育部11问答详解（附一图看懂）   \n",
       "11                        中共中央 国务院印发《深化新时代教育评价改革总体方案》   \n",
       "12  重磅！中办、国办印发《关于全面加强和改进新时代学校体育工作的意见》和《关于全面加强和改进新时...   \n",
       "13                        钟秉林、张志勇、沈炜解读：教育评价“指挥棒”全面转向！   \n",
       "14                       教育部党组：开启全面建设高素质专业化创新型教师队伍新征程   \n",
       "15                       教育部长陈宝生《求是》撰文：新时代建设教育强国的根本指针   \n",
       "16                      如何学习贯彻习近平总书记教师节重要寄语？教育部最新通知来了   \n",
       "17                      习近平主持召开科学家座谈会强调不断向科学技术广度和深度进军   \n",
       "18                                 《求是》杂志发表习近平总书记重要文章   \n",
       "19                                  来听！教育部党组成员这样讲专题党课   \n",
       "\n",
       "                                                   链接          日期  \n",
       "0   https://www.nfu.edu.cn/gjdt/309be8b078444044b5...  2021-04-08  \n",
       "1   https://www.nfu.edu.cn/gjdt/159b20971f8b4051ba...  2021-03-20  \n",
       "2   https://www.nfu.edu.cn/gjdt/27ba495edc1b49f88b...  2021-03-15  \n",
       "3   https://www.nfu.edu.cn/gjdt/20dc120c250642cca5...  2021-01-05  \n",
       "4   https://www.nfu.edu.cn/gjdt/b43531427fb44695bb...  2020-12-22  \n",
       "5   https://www.nfu.edu.cn/gjdt/1509f4f3bc2f4babbe...  2020-12-15  \n",
       "6   https://www.nfu.edu.cn/gjdt/a4b2fb3dacae456497...  2020-11-20  \n",
       "7   https://www.nfu.edu.cn/gjdt/0e7d664c116f4a0ab9...  2020-11-12  \n",
       "8   https://www.nfu.edu.cn/gjdt/71e152ddce12414388...  2020-11-06  \n",
       "9   https://www.nfu.edu.cn/gjdt/0d7bfc95f70841a6b5...  2020-11-04  \n",
       "10  https://www.nfu.edu.cn/gjdt/750c396e278446e687...  2020-10-22  \n",
       "11  https://www.nfu.edu.cn/gjdt/cd3ecf8986ad40e991...  2020-10-22  \n",
       "12  https://www.nfu.edu.cn/gjdt/3a7835bded2441aeb6...  2020-10-21  \n",
       "13  https://www.nfu.edu.cn/gjdt/4d0eb3a8b8ee47f6b9...  2020-10-20  \n",
       "14  https://www.nfu.edu.cn/gjdt/355e3d6207974a3ea6...  2020-10-10  \n",
       "15  https://www.nfu.edu.cn/gjdt/9b6486d83f454a2ca7...  2020-09-22  \n",
       "16  https://www.nfu.edu.cn/gjdt/6bb7172f46b3458b80...  2020-09-12  \n",
       "17  https://www.nfu.edu.cn/gjdt/a8f6f2c6a2c644d299...  2020-09-11  \n",
       "18  https://www.nfu.edu.cn/gjdt/f978598b61024bb298...  2020-08-31  \n",
       "19  https://www.nfu.edu.cn/gjdt/3d9e9dfac159459386...  2020-08-28  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 输出结果\n",
    "# B-D-1 pd.DataFrame 建构，pandas课有教\n",
    "df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath('//div[@class=\"news_title\"]/a/@title'),\n",
    "         \"链接\": list_URL,\n",
    "         \"日期\": parsed.xpath('//font[@class=\"right-more\"]/text()'),\n",
    "     } )\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_excel(\"data_out/nfu_gjdt.xlsx\", sheet_name=\"检索结果\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "26\n"
     ]
    }
   ],
   "source": [
    "for i in range(1,100):\n",
    "    r = session.get('https://www.nfu.edu.cn/gjdt/index'+str(i)+'.htm')\n",
    "    if r.status_code != 200:\n",
    "        print(i)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/gjdt/index.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index1.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index2.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index3.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index4.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index5.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index6.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index7.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index8.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index9.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index10.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index11.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index12.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index13.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index14.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index15.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index16.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index17.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index18.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index19.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index20.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index21.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index22.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index23.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index24.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index25.htm']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url_group = ['https://www.nfu.edu.cn/gjdt/index'+str(i)+'.htm' for i in range(1,26)]\n",
    "url_group.insert(0,'https://www.nfu.edu.cn/gjdt/index.htm')\n",
    "url_group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "for url in url_group:\n",
    "    r = session.get(url)\n",
    "#     print(r.html.html)\n",
    "#     设置存档HTML的名称\n",
    "    path = urllib.parse.urlparse(url).path\n",
    "    with open ('html_out/'+path, encoding = \"utf8\", mode = \"w\") as fp:\n",
    "        fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# xpath 准备：\n",
    "dict_xpath = {\n",
    "    '链接_xpath':'//div[@class=\"news_title\"]/a/@href',\n",
    "    '标题_xpath':'//div[@class=\"news_title\"]/a/@title',\n",
    "    '日期_xpath':'//font[@class=\"right-more\"]/text()'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pages_content_url(parsed):\n",
    "    list_URL  = [urllib.parse.urlunparse\\\n",
    "                 ([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "                 for detail_url in parsed.xpath(dict_xpath['链接_xpath'])]\n",
    "    return list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['index.htm', 'index1.htm', 'index10.htm', 'index11.htm', 'index12.htm', 'index13.htm', 'index14.htm', 'index15.htm', 'index16.htm', 'index17.htm', 'index18.htm', 'index19.htm', 'index2.htm', 'index20.htm', 'index21.htm', 'index22.htm', 'index23.htm', 'index24.htm', 'index25.htm', 'index3.htm', 'index4.htm', 'index5.htm', 'index6.htm', 'index7.htm', 'index8.htm', 'index9.htm']\n"
     ]
    }
   ],
   "source": [
    "import os     # 读取html文件\n",
    "\n",
    "list_df = []\n",
    "\n",
    "\n",
    "files= os.listdir('html_out/gjdt/')\n",
    "print(files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['index.htm', 'index1.htm', 'index10.htm', 'index11.htm', 'index12.htm', 'index13.htm', 'index14.htm', 'index15.htm', 'index16.htm', 'index17.htm', 'index18.htm', 'index19.htm', 'index2.htm', 'index20.htm', 'index21.htm', 'index22.htm', 'index23.htm', 'index24.htm', 'index25.htm', 'index3.htm', 'index4.htm', 'index5.htm', 'index6.htm', 'index7.htm', 'index8.htm', 'index9.htm']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>标题</th>\n",
       "      <th>链结</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>教育部党组《求是》撰文：精心谋划 切实抓好教育系统党史学习教育</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/309be8b078444044b5...</td>\n",
       "      <td>2021-04-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>教育部长陈宝生：把巩固拓展作为开局之年工作主题，做到6个到位</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/159b20971f8b4051ba...</td>\n",
       "      <td>2021-03-20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>如何建设高质量教育体系？“十四五”规划和2035年远景目标纲要明确了</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/27ba495edc1b49f88b...</td>\n",
       "      <td>2021-03-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>教育部长陈宝生《旗帜》撰文：建设高质量教育体系，加快建成教育强国</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/20dc120c250642cca5...</td>\n",
       "      <td>2021-01-05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>重磅！《推进粤港澳大湾区高等教育合作发展规划》正式印发</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/b43531427fb44695bb...</td>\n",
       "      <td>2020-12-22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>368</th>\n",
       "      <td>8</td>\n",
       "      <td>广东省教育厅：今年毕业生就业形势比去年好</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/3829e4c5df9e460abc...</td>\n",
       "      <td>2014-03-28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>369</th>\n",
       "      <td>9</td>\n",
       "      <td>要求职业“高大上” 高校毕业生择业扎堆致就业难</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/776ebc41fae84b36a4...</td>\n",
       "      <td>2014-03-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>370</th>\n",
       "      <td>10</td>\n",
       "      <td>教育部：预计今年贫困地区农村学生上重点高校的人数将比去年增加10%以上</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/41d339ccb3a0464c9c...</td>\n",
       "      <td>2014-03-25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>371</th>\n",
       "      <td>11</td>\n",
       "      <td>学位论文如何才能挤出“水分”</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/1e8fa309bcf847b6ad...</td>\n",
       "      <td>2014-03-24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>372</th>\n",
       "      <td>12</td>\n",
       "      <td>高校低年级学生频繁试水招聘会 专家：鼓励提前预热</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/3f34245a7cb449c99b...</td>\n",
       "      <td>2013-03-31</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>513 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     index                                   标题  \\\n",
       "0        0      教育部党组《求是》撰文：精心谋划 切实抓好教育系统党史学习教育   \n",
       "1        1       教育部长陈宝生：把巩固拓展作为开局之年工作主题，做到6个到位   \n",
       "2        2   如何建设高质量教育体系？“十四五”规划和2035年远景目标纲要明确了   \n",
       "3        3     教育部长陈宝生《旗帜》撰文：建设高质量教育体系，加快建成教育强国   \n",
       "4        4          重磅！《推进粤港澳大湾区高等教育合作发展规划》正式印发   \n",
       "..     ...                                  ...   \n",
       "368      8                 广东省教育厅：今年毕业生就业形势比去年好   \n",
       "369      9              要求职业“高大上” 高校毕业生择业扎堆致就业难   \n",
       "370     10  教育部：预计今年贫困地区农村学生上重点高校的人数将比去年增加10%以上   \n",
       "371     11                       学位论文如何才能挤出“水分”   \n",
       "372     12             高校低年级学生频繁试水招聘会 专家：鼓励提前预热   \n",
       "\n",
       "                                                    链结          日期  \n",
       "0    https://www.nfu.edu.cn/gjdt/309be8b078444044b5...  2021-04-08  \n",
       "1    https://www.nfu.edu.cn/gjdt/159b20971f8b4051ba...  2021-03-20  \n",
       "2    https://www.nfu.edu.cn/gjdt/27ba495edc1b49f88b...  2021-03-15  \n",
       "3    https://www.nfu.edu.cn/gjdt/20dc120c250642cca5...  2021-01-05  \n",
       "4    https://www.nfu.edu.cn/gjdt/b43531427fb44695bb...  2020-12-22  \n",
       "..                                                 ...         ...  \n",
       "368  https://www.nfu.edu.cn/gjdt/3829e4c5df9e460abc...  2014-03-28  \n",
       "369  https://www.nfu.edu.cn/gjdt/776ebc41fae84b36a4...  2014-03-27  \n",
       "370  https://www.nfu.edu.cn/gjdt/41d339ccb3a0464c9c...  2014-03-25  \n",
       "371  https://www.nfu.edu.cn/gjdt/1e8fa309bcf847b6ad...  2014-03-24  \n",
       "372  https://www.nfu.edu.cn/gjdt/3f34245a7cb449c99b...  2013-03-31  \n",
       "\n",
       "[513 rows x 4 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "list_df = []\n",
    "\n",
    "# os listdir取出某文件夹中的文件的名称组成列表\n",
    "files= os.listdir('html_out/gjdt/')\n",
    "print(files)\n",
    "\n",
    "for html in files:\n",
    "    with open('html_out/gjdt/'+html,encoding='utf8',mode='r') as fp:\n",
    "        html_load = fp.read()\n",
    "        parsed = requests_html.soup_parse(html_load)\n",
    "        list_URL = pages_content_url(parsed)\n",
    "        \n",
    "        df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath(dict_xpath['标题_xpath']),\n",
    "         \"链结\": list_URL,\n",
    "         \"日期\": parsed.xpath(dict_xpath['日期_xpath']),\n",
    "        } )\n",
    "        list_df.append(df)\n",
    "\n",
    "\n",
    "# 凭借表格pandas 方法concat\n",
    "# 数据分析的方法 concat拼接DataFrame  sort_values排序\n",
    "df_all = pd.concat(list_df).reset_index().sort_values(by='日期',ascending=False)\n",
    "display(df_all)    \n",
    "\n",
    "with pd.ExcelWriter('data_out/nfu_高教动态.xlsx',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_all.to_excel(writer, sheet_name='高教动态')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
